Clean up and fix VCPU hotplug and SMP save/restore.
author: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Tue, 28 Mar 2006 17:43:30 +0000 (18:43 +0100)
committer: kaf24@firebug.cl.cam.ac.uk <kaf24@firebug.cl.cam.ac.uk>
Tue, 28 Mar 2006 17:43:30 +0000 (18:43 +0100)
 1. No longer hold xenbus_lock while taking down VCPUs
    in SMP suspend path. This allows block device hotplug
    to continue working and so we will not deadlock on
    paging in userspace hotplug code.
 2. Track both the xenbus-permitted and the local-admin-permitted
    cpumasks for VCPUs to bring online. Thus, if a local admin
    takes a CPU down, that decision will not be unexpectedly
    overridden the next time the kernel interrogates xenstore.

Signed-off-by: Keir Fraser <keir@xensource.com>
linux-2.6-xen-sparse/drivers/xen/core/reboot.c
linux-2.6-xen-sparse/drivers/xen/core/smpboot.c
linux-2.6-xen-sparse/drivers/xen/pciback/xenbus.c
linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c
linux-2.6-xen-sparse/include/xen/gnttab.h
linux-2.6-xen-sparse/include/xen/xenbus.h

index e63e40465dbb0df5abcf8f86a466bf703091b4d0..f0f1d803a56abb4b33c3558271f974676e05b1ab 100644 (file)
@@ -15,6 +15,7 @@
 #include <xen/xenbus.h>
 #include <linux/cpu.h>
 #include <linux/kthread.h>
+#include <xen/gnttab.h>
 #include <xen/xencons.h>
 
 #if defined(__i386__) || defined(__x86_64__)
@@ -76,31 +77,24 @@ static int shutting_down = SHUTDOWN_INVALID;
 static void __shutdown_handler(void *unused);
 static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
 
-#ifndef CONFIG_HOTPLUG_CPU
-#define cpu_down(x) (-EOPNOTSUPP)
-#define cpu_up(x) (-EOPNOTSUPP)
+#ifdef CONFIG_SMP
+int  smp_suspend(void);
+void smp_resume(void);
+#else
+#define smp_suspend()  (0)
+#define smp_resume()   ((void)0)
 #endif
 
-
 static int __do_suspend(void *ignore)
 {
-       int i, j, k, fpp;
+       int i, j, k, fpp, err;
 
        extern unsigned long max_pfn;
        extern unsigned long *pfn_to_mfn_frame_list_list;
        extern unsigned long *pfn_to_mfn_frame_list[];
 
-       extern int gnttab_suspend(void);
-       extern int gnttab_resume(void);
        extern void time_resume(void);
 
-#ifdef CONFIG_SMP
-       cpumask_t prev_online_cpus;
-       int vcpu_prepare(int vcpu);
-#endif
-
-       int err = 0;
-
        BUG_ON(smp_processor_id() != 0);
        BUG_ON(in_interrupt());
 
@@ -110,40 +104,12 @@ static int __do_suspend(void *ignore)
                return -EOPNOTSUPP;
        }
 
-#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
-       if (num_online_cpus() > 1) {
-               printk(KERN_WARNING "Can't suspend SMP guests "
-                      "without CONFIG_HOTPLUG_CPU\n");
-               return -EOPNOTSUPP;
-       }
-#endif
+       err = smp_suspend();
+       if (err)
+               return err;
 
        xenbus_suspend();
 
-       lock_cpu_hotplug();
-#ifdef CONFIG_SMP
-       /*
-        * Take all other CPUs offline. We hold the hotplug mutex to
-        * avoid other processes bringing up CPUs under our feet.
-        */
-       cpus_clear(prev_online_cpus);
-       while (num_online_cpus() > 1) {
-               for_each_online_cpu(i) {
-                       if (i == 0)
-                               continue;
-                       unlock_cpu_hotplug();
-                       err = cpu_down(i);
-                       lock_cpu_hotplug();
-                       if (err != 0) {
-                               printk(KERN_CRIT "Failed to take all CPUs "
-                                      "down: %d.\n", err);
-                               goto out_reenable_cpus;
-                       }
-                       cpu_set(i, prev_online_cpus);
-               }
-       }
-#endif
-
        preempt_disable();
 
 #ifdef __i386__
@@ -153,7 +119,6 @@ static int __do_suspend(void *ignore)
 
        __cli();
        preempt_enable();
-       unlock_cpu_hotplug();
 
        gnttab_suspend();
 
@@ -203,30 +168,9 @@ static int __do_suspend(void *ignore)
 
        xencons_resume();
 
-#ifdef CONFIG_SMP
-       for_each_cpu(i)
-               vcpu_prepare(i);
-
-#endif
-
-       /*
-        * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
-        * the VCPU hotplug callback can race with our vcpu_prepare
-        */
        xenbus_resume();
 
-#ifdef CONFIG_SMP
- out_reenable_cpus:
-       for_each_cpu_mask(i, prev_online_cpus) {
-               j = cpu_up(i);
-               if ((j != 0) && !cpu_online(i)) {
-                       printk(KERN_CRIT "Failed to bring cpu "
-                              "%d back up (%d).\n",
-                              i, j);
-                       err = j;
-               }
-       }
-#endif
+       smp_resume();
 
        return err;
 }
@@ -334,7 +278,6 @@ static void shutdown_handler(struct xenbus_watch *watch,
        kfree(str);
 }
 
-#ifdef CONFIG_MAGIC_SYSRQ
 static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
                          unsigned int len)
 {
@@ -360,45 +303,35 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
        if (err == -EAGAIN)
                goto again;
 
-       if (sysrq_key != '\0') {
+#ifdef CONFIG_MAGIC_SYSRQ
+       if (sysrq_key != '\0')
                handle_sysrq(sysrq_key, NULL, NULL);
-       }
-}
 #endif
+}
 
 static struct xenbus_watch shutdown_watch = {
        .node = "control/shutdown",
        .callback = shutdown_handler
 };
 
-#ifdef CONFIG_MAGIC_SYSRQ
 static struct xenbus_watch sysrq_watch = {
        .node ="control/sysrq",
        .callback = sysrq_handler
 };
-#endif
 
 static int setup_shutdown_watcher(struct notifier_block *notifier,
                                   unsigned long event,
                                   void *data)
 {
-       int err1 = 0;
-#ifdef CONFIG_MAGIC_SYSRQ
-       int err2 = 0;
-#endif
-
-       err1 = register_xenbus_watch(&shutdown_watch);
-#ifdef CONFIG_MAGIC_SYSRQ
-       err2 = register_xenbus_watch(&sysrq_watch);
-#endif
+       int err;
 
-       if (err1)
+       err = register_xenbus_watch(&shutdown_watch);
+       if (err)
                printk(KERN_ERR "Failed to set shutdown watcher\n");
 
-#ifdef CONFIG_MAGIC_SYSRQ
-       if (err2)
+       err = register_xenbus_watch(&sysrq_watch);
+       if (err)
                printk(KERN_ERR "Failed to set sysrq watcher\n");
-#endif
 
        return NOTIFY_DONE;
 }
index fb0b822b5cbb0be9fd4b0e658601114f721ff03a..1f588c59eec26524d13f016ee656cdd22f27e2f6 100644 (file)
@@ -79,6 +79,15 @@ EXPORT_SYMBOL(x86_cpu_to_apicid);
 unsigned int maxcpus = NR_CPUS;
 #endif
 
+/*
+ * Set of CPUs that remote admin software will allow us to bring online.
+ * Notified to us via xenbus.
+ */
+static cpumask_t xenbus_allowed_cpumask;
+
+/* Set of CPUs that local admin will allow us to bring online. */
+static cpumask_t local_allowed_cpumask = CPU_MASK_ALL;
+
 void __init prefill_possible_map(void)
 {
        int i, rc;
@@ -146,7 +155,7 @@ static void cpu_bringup(void)
        cpu_idle();
 }
 
-void vcpu_prepare(int vcpu)
+static void vcpu_prepare(int vcpu)
 {
        vcpu_guest_context_t ctxt;
        struct task_struct *idle = idle_task(vcpu);
@@ -278,6 +287,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
                vcpu_prepare(cpu);
        }
 
+       xenbus_allowed_cpumask = cpu_present_map;
+
        /* Currently, Xen gives no dynamic NUMA/HT info. */
        for (cpu = 1; cpu < NR_CPUS; cpu++) {
                cpu_sibling_map[cpu] = cpumask_of_cpu(cpu);
@@ -301,6 +312,15 @@ void __devinit smp_prepare_boot_cpu(void)
        cpu_online_map   = cpumask_of_cpu(0);
 }
 
+static int local_cpu_hotplug_request(void)
+{
+       /*
+        * We assume a CPU hotplug request comes from local admin if it is made
+        * via a userspace process (i.e., one with a real mm_struct).
+        */
+       return (current->mm != NULL);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
@@ -331,8 +351,10 @@ static void vcpu_hotplug(unsigned int cpu)
        }
 
        if (strcmp(state, "online") == 0) {
+               cpu_set(cpu, xenbus_allowed_cpumask);
                (void)cpu_up(cpu);
        } else if (strcmp(state, "offline") == 0) {
+               cpu_clear(cpu, xenbus_allowed_cpumask);
                (void)cpu_down(cpu);
        } else {
                printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
@@ -353,6 +375,22 @@ static void handle_vcpu_hotplug_event(
        }
 }
 
+static int smpboot_cpu_notify(struct notifier_block *notifier,
+                             unsigned long action, void *hcpu)
+{
+       int cpu = (long)hcpu;
+
+       /*
+        * We do this in a callback notifier rather than __cpu_disable()
+        * because local_cpu_hotplug_request() does not work in the latter
+        * as it's always executed from within a stopmachine kthread.
+        */
+       if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request())
+               cpu_clear(cpu, local_allowed_cpumask);
+
+       return NOTIFY_OK;
+}
+
 static int setup_cpu_watcher(struct notifier_block *notifier,
                              unsigned long event, void *data)
 {
@@ -360,7 +398,8 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
 
        static struct xenbus_watch cpu_watch = {
                .node = "cpu",
-               .callback = handle_vcpu_hotplug_event };
+               .callback = handle_vcpu_hotplug_event,
+               .flags = XBWF_new_thread };
        (void)register_xenbus_watch(&cpu_watch);
 
        if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
@@ -375,14 +414,62 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
 
 static int __init setup_vcpu_hotplug_event(void)
 {
+       static struct notifier_block hotplug_cpu = {
+               .notifier_call = smpboot_cpu_notify };
        static struct notifier_block xsn_cpu = {
                .notifier_call = setup_cpu_watcher };
+
+       register_cpu_notifier(&hotplug_cpu);
        register_xenstore_notifier(&xsn_cpu);
+
        return 0;
 }
 
 arch_initcall(setup_vcpu_hotplug_event);
 
+int smp_suspend(void)
+{
+       int i, err;
+
+       lock_cpu_hotplug();
+
+       /*
+        * Take all other CPUs offline. We hold the hotplug mutex to
+        * avoid other processes bringing up CPUs under our feet.
+        */
+       while (num_online_cpus() > 1) {
+               unlock_cpu_hotplug();
+               for_each_online_cpu(i) {
+                       if (i == 0)
+                               continue;
+                       err = cpu_down(i);
+                       if (err) {
+                               printk(KERN_CRIT "Failed to take all CPUs "
+                                      "down: %d.\n", err);
+                               for_each_cpu(i)
+                                       vcpu_hotplug(i);
+                               return err;
+                       }
+               }
+               lock_cpu_hotplug();
+       }
+
+       return 0;
+}
+
+void smp_resume(void)
+{
+       int i;
+
+       for_each_cpu(i)
+               vcpu_prepare(i);
+
+       unlock_cpu_hotplug();
+
+       for_each_cpu(i)
+               vcpu_hotplug(i);
+}
+
 int __cpu_disable(void)
 {
        cpumask_t map = cpu_online_map;
@@ -415,6 +502,20 @@ void __cpu_die(unsigned int cpu)
 
 #else /* !CONFIG_HOTPLUG_CPU */
 
+int smp_suspend(void)
+{
+       if (num_online_cpus() > 1) {
+               printk(KERN_WARNING "Can't suspend SMP guests "
+                      "without CONFIG_HOTPLUG_CPU\n");
+               return -EOPNOTSUPP;
+       }
+       return 0;
+}
+
+void smp_resume(void)
+{
+}
+
 int __cpu_disable(void)
 {
        return -ENOSYS;
@@ -429,6 +530,20 @@ void __cpu_die(unsigned int cpu)
 
 int __devinit __cpu_up(unsigned int cpu)
 {
+       int rc;
+
+       if (local_cpu_hotplug_request()) {
+               cpu_set(cpu, local_allowed_cpumask);
+               if (!cpu_isset(cpu, xenbus_allowed_cpumask)) {
+                       printk("%s: attempt to bring up CPU %u disallowed by "
+                              "remote admin.\n", __FUNCTION__, cpu);
+                       return -EBUSY;
+               }
+       } else if (!cpu_isset(cpu, local_allowed_cpumask) ||
+                  !cpu_isset(cpu, xenbus_allowed_cpumask)) {
+               return -EBUSY;
+       }
+
 #ifdef CONFIG_SMP_ALTERNATIVES
        if (num_online_cpus() == 1)
                prepare_for_smp();
@@ -436,7 +551,9 @@ int __devinit __cpu_up(unsigned int cpu)
 
        xen_smp_intr_init(cpu);
        cpu_set(cpu, cpu_online_map);
-       if (HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL) != 0)
+
+       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+       if (rc != 0)
                BUG();
 
        return 0;
index f338de1f375a147e8e330bf41e36bbe78a9584fd..c64d2500671210de7ef5bc2b9740b25b2f2288e3 100644 (file)
@@ -16,7 +16,7 @@ static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
 {
        struct pciback_device *pdev;
 
-       pdev = kmalloc(sizeof(struct pciback_device), GFP_KERNEL);
+       pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
        if (pdev == NULL)
                goto out;
        dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
index 7e343eff212e9456794e0ddfb54cdf66e633b56a..2cb2e97a879dbf5e8f5817376f870f4bb7acefa7 100644 (file)
@@ -685,6 +685,24 @@ void xs_resume(void)
        up_write(&xs_state.suspend_mutex);
 }
 
+static int xenwatch_handle_callback(void *data)
+{
+       struct xs_stored_msg *msg = data;
+
+       msg->u.watch.handle->callback(msg->u.watch.handle,
+                                     (const char **)msg->u.watch.vec,
+                                     msg->u.watch.vec_size);
+
+       kfree(msg->u.watch.vec);
+       kfree(msg);
+
+       /* Kill this kthread if we were spawned just for this callback. */
+       if (current->pid != xenwatch_pid)
+               do_exit(0);
+
+       return 0;
+}
+
 static int xenwatch_thread(void *unused)
 {
        struct list_head *ent;
@@ -707,12 +725,11 @@ static int xenwatch_thread(void *unused)
 
                if (ent != &watch_events) {
                        msg = list_entry(ent, struct xs_stored_msg, list);
-                       msg->u.watch.handle->callback(
-                               msg->u.watch.handle,
-                               (const char **)msg->u.watch.vec,
-                               msg->u.watch.vec_size);
-                       kfree(msg->u.watch.vec);
-                       kfree(msg);
+                       if (msg->u.watch.handle->flags & XBWF_new_thread)
+                               kthread_run(xenwatch_handle_callback,
+                                           msg, "xenwatch_cb");
+                       else
+                               xenwatch_handle_callback(msg);
                }
 
                mutex_unlock(&xenwatch_mutex);
index 584a5db3556224dffddf8b48507607e36569b8be..e9fd1b63ae81b96c5a17964c6322ff5f0392ffd5 100644 (file)
@@ -110,6 +110,9 @@ void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
 #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
 #endif
 
+int gnttab_suspend(void);
+int gnttab_resume(void);
+
 #endif /* __ASM_GNTTAB_H__ */
 
 /*
index 122480584ff9de98723921ca92514bf273a09277..6d69963870b52c87fdb186ea26e33267f6c18ea8 100644 (file)
@@ -55,8 +55,17 @@ struct xenbus_watch
        /* Callback (executed in a process context with no locks held). */
        void (*callback)(struct xenbus_watch *,
                         const char **vec, unsigned int len);
+
+       /* See XBWF_ definitions below. */
+       unsigned long flags;
 };
 
+/*
+ * Execute callback in its own kthread. Useful if the callback is long
+ * running or heavily serialised, to avoid taking out the main xenwatch thread
+ * for a long period of time (or even unwittingly causing a deadlock).
+ */
+#define XBWF_new_thread        1
 
 /* A xenbus device. */
 struct xenbus_device {